package net.nutch.util;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
import net.nutch.fetcher.Outlink;
import org.w3c.dom.*;
/**
* A collection of methods for extracting content from DOM trees.
*
* This class holds a few utility methods for pulling content out of
* DOM nodes, such as getOutlinks, getText, etc.
*
*/
public class DOMContentUtils {

  /**
   * Appends all the content text found beneath the DOM
   * <code>node</code> (including the node itself, if it is a text
   * node) to the supplied {@link StringBuffer}.
   *
   * <p>
   *
   * If <code>abortOnNestedAnchors</code> is true, DOM traversal is
   * aborted as soon as an anchor nested inside another anchor is
   * reached, and the <code>StringBuffer</code> will not contain any
   * text encountered after that point.  If it is false, nested
   * anchors are not tracked at all.
   *
   * <p>
   *
   * Currently, only SCRIPT, STYLE and comment text are ignored.
   *
   * @param sb buffer that receives the text
   * @param node root of the subtree to extract text from
   * @param abortOnNestedAnchors whether to stop at the first nested
   * anchor
   * @return true only if <code>abortOnNestedAnchors</code> is true
   * and a nested anchor was found; false otherwise
   */
  public static final boolean getText(StringBuffer sb, Node node,
                                      boolean abortOnNestedAnchors) {
    return getTextHelper(sb, node, abortOnNestedAnchors, 0);
  }

  /**
   * This is a convenience method, equivalent to {@link
   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
   *
   * @param sb buffer that receives the text
   * @param node root of the subtree to extract text from
   */
  public static final void getText(StringBuffer sb, Node node) {
    getText(sb, node, false);
  }

  /**
   * Recursive worker for {@link #getText(StringBuffer,Node,boolean)}.
   *
   * @param anchorDepth number of enclosing anchors seen so far on the
   * path from the root; only maintained when
   * <code>abortOnNestedAnchors</code> is true
   * @return true if abortOnNestedAnchors is true and a nested anchor
   * was found (traversal was cut short)
   */
  private static final boolean getTextHelper(StringBuffer sb, Node node,
                                             boolean abortOnNestedAnchors,
                                             int anchorDepth) {
    String name = node.getNodeName();

    // Text inside SCRIPT and STYLE is not page content; skip the subtree.
    if ("script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)) {
      return false;
    }

    if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)) {
      anchorDepth++;
      if (anchorDepth > 1) {
        return true;
      }
    }

    short type = node.getNodeType();
    if (type == Node.COMMENT_NODE) {
      return false;
    }
    if (type == Node.TEXT_NODE) {
      String text = node.getNodeValue();
      // StringBuffer.append(String) would write the literal "null" for
      // a null value, so only append real text.
      if (text != null) {
        sb.append(text);
      }
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        if (getTextHelper(sb, children.item(i),
                          abortOnNestedAnchors, anchorDepth)) {
          // propagate the abort all the way up
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Appends the content text found beneath the first
   * <code>title</code> element (in document order, searching
   * <code>node</code> itself and then its descendants) to the
   * supplied {@link StringBuffer}.
   *
   * @param sb buffer that receives the title text
   * @param node root of the subtree to search
   * @return true if a title node was found, false otherwise
   */
  public static final boolean getTitle(StringBuffer sb, Node node) {
    if (node.getNodeType() == Node.ELEMENT_NODE
        && "title".equalsIgnoreCase(node.getNodeName())) {
      getText(sb, node);
      return true;
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        if (getTitle(sb, children.item(i))) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Returns true if the value of <code>node</code> consists solely of
   * whitespace characters.  A null or empty value counts as
   * whitespace-only.
   */
  private static boolean hasOnlyWhiteSpace(Node node) {
    String val = node.getNodeValue();
    if (val == null) {
      return true;
    }
    for (int i = 0; i < val.length(); i++) {
      if (!Character.isWhitespace(val.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /** Returns true if <code>node</code> is an element named "a" (any case). */
  private static boolean isAnchorElement(Node node) {
    return node.getNodeType() == Node.ELEMENT_NODE
      && "a".equalsIgnoreCase(node.getNodeName());
  }

  /** Returns true if <code>node</code> is a whitespace-only text node. */
  private static boolean isBlankText(Node node) {
    return node.getNodeType() == Node.TEXT_NODE && hasOnlyWhiteSpace(node);
  }

  /**
   * Decides whether a link element should be discarded.
   *
   * <p>
   *
   * This only covers a few cases of empty links that are symptomatic
   * of nekohtml's DOM-fixup process: a link with no children, or a
   * link whose only children are a single nested anchor optionally
   * preceded and/or followed by a whitespace-only text node.
   * <code>area</code> elements are never discarded.
   *
   * @param node the link element being examined
   * @param children the child list of <code>node</code>; only
   * dereferenced when <code>childLen</code> is greater than zero
   * @param childLen number of entries in <code>children</code>
   * @return true if the link should be thrown away
   */
  private static boolean shouldThrowAwayLink(Node node, NodeList children,
                                             int childLen) {
    if (node.getNodeName().equalsIgnoreCase("area")) {
      return false;
    }

    switch (childLen) {
    case 0:
      // this has no inner structure
      return true;
    case 1:
      // single nested link
      return isAnchorElement(children.item(0));
    case 2: {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      // single link followed by whitespace node, or
      // whitespace node followed by single link
      return (isAnchorElement(c0) && isBlankText(c1))
        || (isAnchorElement(c1) && isBlankText(c0));
    }
    case 3: {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      Node c2 = children.item(2);
      // single link surrounded by whitespace nodes
      return isAnchorElement(c1) && isBlankText(c0) && isBlankText(c2);
    }
    default:
      return false;
    }
  }

  /**
   * Returns the value of the first attribute of <code>node</code>
   * whose name equals "href" ignoring case, or null if the node has
   * no attribute map or no such attribute.
   */
  private static String findHref(Node node) {
    NamedNodeMap attrs = node.getAttributes();
    if (attrs == null) {
      return null;
    }
    int len = attrs.getLength();
    for (int i = 0; i < len; i++) {
      Node attr = attrs.item(i);
      if ("href".equalsIgnoreCase(attr.getNodeName())) {
        return attr.getNodeValue();
      }
    }
    return null;
  }

  /**
   * This method finds all anchors (<code>a</code> and
   * <code>area</code> elements) at or below the supplied DOM
   * <code>node</code>, creates an {@link Outlink} record for each one
   * that has an <code>href</code> attribute (resolved relative to the
   * supplied <code>base</code> URL), and adds them to the
   * <code>outlinks</code> {@link ArrayList}.
   *
   * <p>
   *
   * Links without inner structure (tags, text, etc) are discarded, as
   * are links which contain only single nested links and empty text
   * nodes (this is a common DOM-fixup artifact, at least with
   * nekohtml).  Links whose target cannot be turned into a valid URL
   * are silently skipped.
   *
   * <p>
   *
   * The anchor text stored in each <code>Outlink</code> is the
   * trimmed text of the link, cut off at the first nested anchor.
   *
   * @param base URL against which link targets are resolved
   * @param outlinks list that receives the <code>Outlink</code>
   * objects
   * @param node root of the subtree to scan
   */
  public static final void getOutlinks(URL base, ArrayList outlinks,
                                       Node node) {
    NodeList children = node.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      String name = node.getNodeName();
      boolean isLink =
        "a".equalsIgnoreCase(name) || "area".equalsIgnoreCase(name);

      // Links with no inner structure, or just a single nested
      // anchor, are tossed.
      if (isLink && !shouldThrowAwayLink(node, children, childLen)) {
        // Look up the target first: without one there is no outlink,
        // so the anchor text need not be collected at all.
        String target = findHref(node);
        if (target != null) {
          StringBuffer linkText = new StringBuffer();
          getText(linkText, node, true);
          try {
            URL url = new URL(base, target);
            outlinks.add(new Outlink(url.toString(),
                                     linkText.toString().trim()));
          } catch (MalformedURLException ignored) {
            // Deliberately best-effort: an unparseable link target is
            // simply not reported as an outlink.
          }
        }
      }
    }

    // Keep descending even below links, so nested anchors are found too.
    for (int i = 0; i < childLen; i++) {
      getOutlinks(base, outlinks, children.item(i));
    }
  }
}